import datetime
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.figure_factory as ff

pd.options.display.float_format = '{:.2f}'.format
%matplotlib inline
data = pd.read_csv('test_data.csv',sep=';',dtype={'userId':object,'clientId':object}, parse_dates=['timestamp']).drop_duplicates()
data
data.info()
Here we derive a datetime from the millisecond timestamp and extract the platform from the messy userAgent field. The dataset also contains duplicated rows where userId is missing; such drops could be done with a custom groupby function, but custom Python functions kill performance, so we use drop_duplicates instead.
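As a faster alternative to the per-platform loop below, a single vectorized str.extract can do it in one pass (a sketch; note that extract returns the first pattern matched inside the string, while the loop lets later list entries win, so results can differ for user agents mentioning several platforms, e.g. Android ones also contain Linux):
# one vectorized pass over userAgent instead of one .apply() per platform;
# rows matching none of the patterns get NaN, filled with 0 as in the loop below
pattern = r'(Windows|Linux|Macintosh|iPad|iPod|iPhone|PlayStation|Android)'
data['platform'] = data['userAgent'].str.extract(pattern, expand=False).fillna(0)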
data['datetime'] = pd.to_datetime(data['timestamp'], unit='ms')
data['date'] = data['datetime'].dt.date
data = data.sort_values(['datetime', 'userId'])
# later platforms in the list overwrite earlier matches, so e.g. Android
# takes precedence over Linux for user agents that mention both
for platform in ["Windows", "Linux", "Macintosh", "iPad", "iPod", "iPhone", "PlayStation", "Android"]:
    data.loc[data['userAgent'].str.contains(platform, regex=False, na=False), 'platform'] = platform
data['platform'] = data['platform'].fillna(0)
data = data.drop_duplicates(subset=['datetime', 'userId'])
Here we create a few supplementary tables for further storage in a database if needed.
#the clients table stores each clientId with its first detection datetime, source, medium and platform
clients = (data.sort_values('datetime')
               .drop_duplicates(subset='clientId')
               [['clientId', 'datetime', 'source', 'medium', 'platform']]
               .reset_index(drop=True)
               .copy())
clients.rename({'datetime': 'client_first_detection_datetime', 'source': 'client_first_source',
                'medium': 'client_first_medium', 'platform': 'client_first_platform'}, axis=1, inplace=True)
clients.sample(5)
# clients_users stores the mapping of clientId to userId, keyed by the first datetime the pair was seen together
clients_users = (data.dropna(subset=['userId'])
                     .groupby(['clientId', 'userId'], as_index=False)
                     .agg({'datetime': 'min'})
                     .set_index('datetime', drop=True)
                     .copy())
clients_users.sample(5)
# time since the client's previous action, and whether it came from the same platform
data['prev_action_time_spent'] = data.groupby('clientId')['datetime'].diff()
data['prev_action_platform_same'] = data.groupby('clientId')['platform'].shift(1) == data['platform']
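A toy check of these two helper columns, with hypothetical timestamps for a single client:
toy = pd.DataFrame({
    'clientId': ['a', 'a', 'a'],
    'datetime': pd.to_datetime(['2020-01-01 10:00', '2020-01-01 10:05', '2020-01-01 13:00']),
    'platform': ['Windows', 'Windows', 'Android'],
})
toy['gap'] = toy.groupby('clientId')['datetime'].diff()                         # NaT, 5 min, 2h55m
toy['same'] = toy.groupby('clientId')['platform'].shift(1) == toy['platform']   # False, True, False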
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
fig, ax = plt.subplots(2,sharex=True, figsize=(20,7) )
fig.suptitle('Cumulative distribution of the time between clicks, in minutes (from 1 minute to 1440, i.e. one day). From it we can pick the gap that splits clicks into separate visits; e.g. a 3-hour cutoff covers 75% of clicks.')
plt.xticks(np.concatenate((np.arange(0, 900, 30),np.arange(900, 1441, 60))))
# time between consecutive clicks, clipped to [1 minute, 1 day], expressed in minutes
click_gaps = data.loc[
    (data['prev_action_time_spent'] >= np.timedelta64(1, 'm')) &
    (data['prev_action_time_spent'] <= np.timedelta64(1, 'D')),
    'prev_action_time_spent'
].astype('timedelta64[m]')
sns.distplot(click_gaps, bins=48, kde_kws={'cumulative': False}, color="b", ax=ax[1])
sns.kdeplot(click_gaps, cumulative=True, shade=True, color="b", ax=ax[0])
fig = px.histogram(click_gaps, x="prev_action_time_spent",
                   marginal="box", histnorm='probability',  # or violin, rug
                   title='Distribution of the time between clicks, in minutes (from 1 minute to 1440, i.e. one day)')
fig.show()
From the plots we can decide at which gap to split the data into visits. Without any additional information, I decided to treat as a new login every action whose time difference from the previous one is at least 1 hour; that covers 80% of all clicks during the day. Additionally, an action from a different platform is also treated as a new login.
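As a sketch, the same rule can also number every click with a session id directly via a per-client running sum of a boolean flag, avoiding a separate login table when only the numbering is needed (session_id here is illustrative and not used below):
# a click opens a new session if it is the client's first one,
# follows a gap of an hour or more, or comes from another platform
is_new_login = (data['prev_action_time_spent'].isnull()
                | (data['prev_action_time_spent'] >= np.timedelta64(1, 'h'))
                | ~data['prev_action_platform_same'])
# running count of the flag within each client numbers sessions 0, 1, 2, ...
data['session_id'] = is_new_login.astype(int).groupby(data['clientId']).cumsum() - 1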
The city is stored in the geoNetwork field as a dictionary string. Extracting it is very memory-hungry and can easily crash the session with an out-of-memory error, which is why I do it only here, on the much smaller login-log table.
#client_login_logs: every action with a gap of 1 hour or more from the previous one
#(or no previous action, or a platform switch) counts as a new login
new_login = ((data['prev_action_time_spent'] >= np.timedelta64(1, 'h'))
             | data['prev_action_time_spent'].isnull()
             | ~data['prev_action_platform_same'])
client_login_logs = data.loc[new_login].copy()
client_login_logs = pd.concat([client_login_logs,
                               client_login_logs['geoNetwork'].apply(lambda x: dict(eval(x))).apply(pd.Series)],
                              axis=1)
client_login_logs = client_login_logs.loc[:, ~client_login_logs.columns.duplicated()]
client_login_logs = client_login_logs[['clientId', 'country', 'city', 'latitude', 'longitude',
                                       'date', 'datetime', 'platform']].sort_values('datetime').reset_index(drop=True)
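As an aside, a safer and leaner variant of the eval-based expansion above, assuming geoNetwork holds plain Python dict literals: ast.literal_eval refuses arbitrary expressions, and pulling only the needed keys avoids materialising the fully expanded frame.
import ast
def parse_geo(s):
    # literal_eval evaluates only Python literals, so unlike eval it
    # cannot execute code smuggled into the field
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return {}
geo = client_login_logs['geoNetwork'].apply(parse_geo)
for key in ['country', 'city', 'latitude', 'longitude']:
    client_login_logs[key] = geo.apply(lambda d: d.get(key))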
# number each client's logins 0, 1, 2, ... in chronological order
client_login_logs['session_num'] = client_login_logs.groupby('clientId').cumcount()
client_login_logs.sample(5)
For analysis and visualisation we join everything we have gathered back onto the base dataframe. The client_login_logs table only carries the datetime of each session start; since the set is now sorted by client and session, we can forward-fill the empty values from the last known one.
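A toy illustration of the fill with hypothetical values; grouping by client before forward-filling guarantees a session number never leaks from one client into the next:
toy = pd.DataFrame({'clientId':    ['a', 'a', 'a', 'b', 'b'],
                    'session_num': [0.0, None, 1.0, 0.0, None]})
# within each client, NaN rows inherit the last value seen above them
toy['session_num'] = toy.groupby('clientId')['session_num'].ffill()   # 0, 0, 1, 0, 0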
data = data.merge(clients, how='left', on='clientId')
data = data.merge(client_login_logs[['clientId', 'country', 'city', 'date', 'datetime', 'platform', 'session_num']],
                  how='left', on=['clientId', 'country', 'datetime', 'date', 'platform'])
data = data.sort_values(by=['clientId', 'session_num', 'platform'])
# forward-fill per client so a session number or city never leaks from one client into the next
data[['session_num', 'city']] = data.groupby('clientId')[['session_num', 'city']].ffill()
data['client_first_detection_date'] = data['client_first_detection_datetime'].dt.date
data['time_after_aq'] = data.datetime - data.client_first_detection_datetime
Here we are going to analyse client retention.
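As a reference point for the breakdowns below, a sketch of the classic date-cohort retention table built from the same client_login_logs (the notebook's own tables slice by platform instead):
# cohort = each client's first-visit date; cell (cohort, n) = share of the
# cohort that came back for an nth session
first_seen = (client_login_logs.loc[client_login_logs['session_num'] == 0, ['clientId', 'date']]
              .rename(columns={'date': 'cohort'}))
cohort = client_login_logs.merge(first_seen, on='clientId')
cohort_counts = cohort.pivot_table(index='cohort', columns='session_num',
                                   values='clientId', aggfunc='nunique')
cohort_rates = cohort_counts.div(cohort_counts[0], axis=0)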
retended_platform = (client_login_logs[(client_login_logs['session_num'] != 0) & (client_login_logs['platform'] != 0)]
                     .groupby(['date', 'platform', 'session_num'], as_index=False)
                     .agg({'clientId': 'count'})
                     .rename(columns={'clientId': 'retended_clients'}))
first_platform = (client_login_logs[(client_login_logs['session_num'] == 0) & (client_login_logs['platform'] != 0)]
                  .groupby(['date', 'platform'], as_index=False)
                  .agg({'clientId': 'nunique'})
                  .rename(columns={'clientId': 'new_clients'}))
all_visits_platform = (client_login_logs[client_login_logs['platform'] != 0]
                       .groupby(['date', 'platform', 'session_num'], as_index=False)
                       .agg({'clientId': 'count'})
                       .rename(columns={'clientId': 'all_clients'}))
fig = px.bar(all_visits_platform.groupby(['date','platform'], as_index=False).sum(), x='date',
y='all_clients', color='platform', title="Absolute volume of visits by platform.")
fig.show()
fig = px.bar(first_platform, x='date',
y='new_clients', color='platform', title="Absolute volume of new visits by platform. Here we count the first visit of each client")
fig.show()
fig = px.bar(retended_platform.groupby(['date','platform'], as_index=False).sum(), x='date',
y='retended_clients', color='platform', title="Absolute volume of retained visits by platform. Here we count every visit except the first for each client")
fig.show()
client_login_logs.info()
#We detect retention by first-used platform to get correct retention rates.
retention_pivot = (client_login_logs.loc[(client_login_logs['session_num'] <= 10) & (client_login_logs['platform'] != 0)]
                   .merge(clients, on='clientId')
                   .pivot_table(values='clientId', index=['client_first_platform'],
                                columns=['session_num'], aggfunc=pd.Series.nunique)
                   .reset_index())
retention_pivot[['ret ' + str(x) + ' to 1st' for x in range(1, 11)]] = retention_pivot[list(range(1, 11))].div(retention_pivot[0], axis=0)
print('Here we can see the retention rate from the 1st to the 10th return visit vs the first visit')
retention_pivot[['client_first_platform'] + ['ret ' + str(x) + ' to 1st' for x in range(1, 11)]]
#retention_by_platform_detection_date gives us retention broken down by platform.
#We detect retention by first-used platform to get correct retention rates.
retention_by_platform_detection_date = (
    client_login_logs[(client_login_logs['session_num'] == 0) & (client_login_logs['platform'] != 0)]
        [['clientId', 'platform', 'date', 'datetime', 'session_num']]
    .merge(client_login_logs[(client_login_logs['session_num'] != 0) & (client_login_logs['platform'] != 0)]
               [['clientId', 'session_num']], on='clientId')
    .groupby(['date', 'platform', 'session_num_y'], as_index=False)
    .agg({'clientId': 'nunique'})
    .rename(columns={'session_num_y': 'session_num', 'clientId': 'retended_clients'})
)
retention_rate_by_platform_detection_date = (first_platform
    .merge(retention_by_platform_detection_date, how='left', on=['platform', 'date'])
    .fillna(0)
    .rename(columns={'date': 'client_first_detection_date'}))
retention_rate_by_platform_detection_date['retention_rate'] = (
    retention_rate_by_platform_detection_date.retended_clients / retention_rate_by_platform_detection_date.new_clients)
# .copy() avoids a SettingWithCopyWarning on the assignment below
retention_rate_by_platform_first_visits_detection_date = retention_rate_by_platform_detection_date.loc[
    (retention_rate_by_platform_detection_date['session_num'] <= 5)
    & (retention_rate_by_platform_detection_date['session_num'] >= 1)].copy()
retention_rate_by_platform_first_visits_detection_date['session_num'] = (
    retention_rate_by_platform_first_visits_detection_date['session_num'].astype(str))
#retention rate by platform
retention_rate_by_platform = (retention_rate_by_platform_detection_date[['platform', 'session_num', 'retended_clients']]
    .groupby(['platform', 'session_num'], as_index=False).sum()
    .merge(first_platform.groupby('platform', as_index=False)['new_clients'].sum(), how='left', on='platform'))
retention_rate_by_platform['retention_rate'] = (retention_rate_by_platform['retended_clients']
                                                / retention_rate_by_platform['new_clients'])
retention_rate_by_platform_first_visits = retention_rate_by_platform.loc[
    (retention_rate_by_platform['session_num'] <= 5) & (retention_rate_by_platform['session_num'] >= 1)].copy()
retention_rate_by_platform_first_visits['session_num'] = retention_rate_by_platform_first_visits['session_num'].astype(str)
fig = px.line(retention_rate_by_platform.loc[(retention_rate_by_platform['session_num']<=10)&(retention_rate_by_platform['session_num']>=1)],
x='session_num', y='retention_rate', color="platform", hover_data=['new_clients','retention_rate','session_num'], title="Retention rate by platform for first 10 visits vs 1st visit")
fig.update_xaxes(rangeslider_visible=True)
fig.show()
for x in range(10):
    retention_pivot['ret ' + str(x + 1) + ' to prev'] = retention_pivot[x + 1] / retention_pivot[x]
print("Here we can see that conversion after first 3 visits stabilises so we can say that after 3 visit firs platform has no matter for futher clients retentions")
retention_pivot[['client_first_platform']+[ 'ret '+str(x)+' to prev' for x in [*range(1,11,1)]]]
fig = px.line(retention_pivot[['client_first_platform'] + ['ret ' + str(x) + ' to prev' for x in range(1, 11)]]
              .melt(id_vars=['client_first_platform'], var_name='session_num', value_name='retention_rate'),
              x='session_num', y='retention_rate', color="client_first_platform",
              hover_data=['retention_rate', 'session_num'],
              title="Retention rate by platform for first 10 visits vs previous visit")
fig.update_xaxes(rangeslider_visible=True)
fig.show()
for name, group in retention_rate_by_platform_first_visits_detection_date.groupby('platform'):
    fig = px.bar(group, x='client_first_detection_date',
                 y='retention_rate', color='session_num',
                 title="Retention rate of visits by detection date for " + name)
    fig.show()